#!/usr/bin/perl
#
# Generation of types and lookup tables for Daitch-Mokotoff soundex.
#
# Copyright (c) 2023, PostgreSQL Global Development Group
#
# This module was originally sponsored by Finance Norway /
# Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
#

use strict;
use warnings;

die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
my $output_file = $ARGV[0];

# Open the output file
open my $OUTPUT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

# Parse code table and generate tree for letter transitions.
my %codes;
my $table = [ {}, [ [ "", "", "" ] ] ];
while (<DATA>)
{
	chomp;
	my ($letters, $codes) = split(/\s+/);
	my @codes = map { [ split(/,/) ] } split(/\|/, $codes);

	my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes);
	my $val = join(
		",\n",
		map {
			"\t{\n\t\t"
			  . join(", ", map { "\"$_\"" } @$_) . "\n\t}"
		} @codes);
	$codes{$key} = $val;

	for my $letter (split(/,/, $letters))
	{
		my $ref = $table->[0];
		# Link each character to the next in the letter combination.
		my @c = split(//, $letter);
		my $last_c = pop(@c);
		for my $c (@c)
		{
			$ref->{$c} //= [ {}, undef ];
			$ref->{$c}[0] //= {};
			$ref = $ref->{$c}[0];
		}
		# The sound code for the letter combination is stored at the last character.
		$ref->{$last_c}[1] = $key;
	}
}
close(DATA);

print $OUTPUT <<EOF;
/*
 * Constants and lookup tables for Daitch-Mokotoff Soundex
 *
 * Copyright (c) 2023, PostgreSQL Global Development Group
 *
 * This file is generated by daitch_mokotoff_header.pl
 */

/* Coding chart table: Soundex codes */
typedef char dm_code[2 + 1];	/* One or two sequential code digits + NUL */
typedef dm_code dm_codes[3];	/* Start of name, before a vowel, any other */

/* Coding chart table: Letter in input sequence */
struct dm_letter
{
	char		letter;			/* Present letter in sequence */
	const struct dm_letter *letters;	/* List of possible successive letters */
	const dm_codes *codes;		/* Code sequence(s) for complete sequence */
};

typedef struct dm_letter dm_letter;

/* Codes for letter sequence at start of name, before a vowel, and any other. */
EOF

for my $key (sort keys %codes)
{
	print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
	  . $codes{$key}
	  . "\n};\n";
}

print $OUTPUT <<EOF;

/* Coding for alternative following letters in sequence. */
EOF

sub hash2code
{
	my ($ref, $letter) = @_;

	my @letters = ();

	my $h = $ref->[0];
	for my $key (sort keys %$h)
	{
		$ref = $h->{$key};
		my $children = "NULL";
		if (defined $ref->[0])
		{
			$children = "letter_$letter$key";
			hash2code($ref, "$letter$key");
		}
		my $codes = $ref->[1] // "NULL";
		push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}");
	}

	print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n";
	for (@letters)
	{
		print $OUTPUT "$_,\n";
	}
	print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
	print $OUTPUT "};\n";
}

hash2code($table, '');

close $OUTPUT;

# Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
#
# The conversion from the coding chart to the table should be self
# explanatory, but note the differences stated below.
#
# X = NC (not coded)
#
# The non-ASCII letters in the coding chart are coded with substitute
# lowercase ASCII letters, which sort after the uppercase ASCII letters:
#
# Ą => a (use '[' for table lookup)
# Ę => e (use '\\' for table lookup)
# Ţ => t (use ']' for table lookup)
#
# The rule for "UE" does not correspond to the coding chart, however
# it is used by all other known implementations, including the one at
# https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
#
# Note that the implementation assumes that vowels are assigned code
# 0 or 1. "J" can be either a vowel or a consonant.
#

__DATA__
AI,AJ,AY				0,1,X
AU						0,7,X
a						X,X,6|X,X,X
A						0,X,X
B						7,7,7
CHS						5,54,54
CH						5,5,5|4,4,4
CK						5,5,5|45,45,45
CZ,CS,CSZ,CZS			4,4,4
C						5,5,5|4,4,4
DRZ,DRS					4,4,4
DS,DSH,DSZ				4,4,4
DZ,DZH,DZS				4,4,4
D,DT					3,3,3
EI,EJ,EY				0,1,X
EU						1,1,X
e						X,X,6|X,X,X
E						0,X,X
FB						7,7,7
F						7,7,7
G						5,5,5
H						5,5,X
IA,IE,IO,IU				1,X,X
I						0,X,X
J						1,X,X|4,4,4
KS						5,54,54
KH						5,5,5
K						5,5,5
L						8,8,8
MN						66,66,66
M						6,6,6
NM						66,66,66
N						6,6,6
OI,OJ,OY				0,1,X
O						0,X,X
P,PF,PH					7,7,7
Q						5,5,5
RZ,RS					94,94,94|4,4,4
R						9,9,9
SCHTSCH,SCHTSH,SCHTCH	2,4,4
SCH						4,4,4
SHTCH,SHCH,SHTSH		2,4,4
SHT,SCHT,SCHD			2,43,43
SH						4,4,4
STCH,STSCH,SC			2,4,4
STRZ,STRS,STSH			2,4,4
ST						2,43,43
SZCZ,SZCS				2,4,4
SZT,SHD,SZD,SD			2,43,43
SZ						4,4,4
S						4,4,4
TCH,TTCH,TTSCH			4,4,4
TH						3,3,3
TRZ,TRS					4,4,4
TSCH,TSH				4,4,4
TS,TTS,TTSZ,TC			4,4,4
TZ,TTZ,TZS,TSZ			4,4,4
t						3,3,3|4,4,4
T						3,3,3
UI,UJ,UY,UE				0,1,X
U						0,X,X
V						7,7,7
W						7,7,7
X						5,54,54
Y						1,X,X
ZDZ,ZDZH,ZHDZH			2,4,4
ZD,ZHD					2,43,43
ZH,ZS,ZSCH,ZSH			4,4,4
Z						4,4,4
